source("./load_files.R")

1 Load and Process data

rafael <-
  load_cdr_rafael() %>% 
  filter(rich == "rich")

thais <- 
  load_cdr_thais() %>% 
  filter(rich == "rich")

carol <- 
  load_cdr_carol() %>% 
  filter(rich == "rich")

cdr <- 
  full_join(rafael, thais) %>% 
  full_join(carol)

glimpse(cdr)
## Rows: 4,271
## Columns: 46
## Groups: expgroup, cycle, time [14]
## $ cdr3      <chr> "GGVNWNDQ", "APAGREFDY", "PLTGLHY", "VQESWGLIDS", "DPGHGDDY…
## $ cycle     <chr> "R4", "R4", "R4", "R4", "R4", "R4", "R4", "R4", "R4", "R4",…
## $ time      <chr> "final", "final", "final", "final", "final", "final", "fina…
## $ expgroup  <chr> "rafael", "rafael", "rafael", "rafael", "rafael", "rafael",…
## $ cdrp      <dbl> 0.011217659, 0.004559435, 0.004197576, 0.003709065, 0.00351…
## $ quantity  <int> 620, 252, 232, 205, 194, 186, 178, 337, 155, 153, 303, 151,…
## $ fcp       <dbl> 252.05518, 204.89647, 188.63485, 166.68165, 157.73776, 151.…
## $ fcq       <dbl> 310.0, 252.0, 232.0, 205.0, 194.0, 186.0, 178.0, 168.5, 155…
## $ length    <int> 8, 9, 7, 10, 8, 15, 14, 7, 10, 9, 7, 7, 9, 6, 11, 10, 20, 1…
## $ MW        <dbl> 888.8806, 1025.0717, 799.9134, 1133.2079, 874.8078, 1833.93…
## $ AV        <dbl> 0.1250, 0.2222, 0.1429, 0.1000, 0.1250, 0.1333, 0.2143, 0.2…
## $ IP        <dbl> 4.0500, 4.3704, 7.1654, 4.0500, 4.0500, 4.7241, 4.5365, 4.0…
## $ flex      <dbl> 0.7795, 0.7601, 0.7171, 0.7599, 0.8070, 0.7617, 0.7513, 0.7…
## $ gravy     <dbl> -1.4375, -0.9333, 0.0571, -0.0900, -2.1750, -1.2400, -0.992…
## $ SSF_Helix <dbl> 0.2500, 0.2222, 0.4286, 0.4000, 0.1250, 0.2000, 0.2857, 0.2…
## $ SSF_Turn  <dbl> 0.5000, 0.2222, 0.2857, 0.3000, 0.3750, 0.1333, 0.2143, 0.2…
## $ SSF_Sheet <dbl> 0.0000, 0.3333, 0.2857, 0.2000, 0.0000, 0.2667, 0.1429, 0.2…
## $ n_A       <int> 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ n_C       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ n_D       <int> 1, 1, 0, 1, 3, 2, 2, 0, 1, 2, 1, 2, 1, 0, 3, 1, 1, 4, 0, 1,…
## $ n_E       <int> 0, 1, 0, 1, 0, 2, 1, 2, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,…
## $ n_F       <int> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,…
## $ n_G       <int> 2, 1, 1, 1, 2, 0, 2, 1, 3, 1, 2, 0, 2, 0, 0, 2, 5, 1, 2, 0,…
## $ n_H       <int> 0, 0, 1, 0, 1, 2, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ n_I       <int> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ n_K       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ n_L       <int> 0, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 2, 1, 1, 0, 0, 1,…
## $ n_M       <int> 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,…
## $ n_N       <int> 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,…
## $ n_P       <int> 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,…
## $ n_Q       <int> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ n_R       <int> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 2, 1, 0, 0, 1, 0, 3, 1, 0, 0,…
## $ n_S       <int> 0, 0, 0, 2, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 3, 4, 3, 1, 3,…
## $ n_T       <int> 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0,…
## $ n_V       <int> 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 2, 0, 0,…
## $ n_W       <int> 1, 0, 0, 1, 0, 0, 2, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ n_Y       <int> 0, 1, 1, 0, 1, 1, 0, 1, 4, 0, 0, 0, 1, 0, 1, 2, 4, 0, 2, 1,…
## $ aliphatic <int> 3, 4, 4, 4, 3, 4, 5, 2, 3, 3, 2, 1, 4, 2, 3, 4, 8, 3, 2, 2,…
## $ aromatic  <int> 1, 2, 1, 1, 1, 2, 3, 2, 4, 1, 1, 2, 2, 1, 2, 2, 4, 1, 2, 2,…
## $ neutral   <int> 3, 0, 1, 3, 0, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 3, 3,…
## $ positive  <int> 0, 1, 1, 0, 1, 3, 2, 0, 0, 2, 2, 1, 0, 0, 1, 0, 3, 1, 1, 1,…
## $ negative  <int> 1, 2, 0, 2, 3, 4, 3, 2, 2, 2, 1, 2, 2, 1, 3, 1, 1, 5, 0, 2,…
## $ invalid   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ file      <chr> "brigido_rafael_Nestor_resultado_nestor_VH_FinalRound_rafaC…
## $ threshold <dbl> 0.293833, 0.293833, 0.293833, 0.293833, 0.293833, 0.293833,…
## $ rich      <fct> rich, rich, rich, rich, rich, rich, rich, rich, rich, rich,…

2 Clustering Methods

2.1 PCA

library(tidytext)

set.seed(42)
pca_rec <- 
  recipe( ~ ., data = cdr) %>% 
  step_rm(file, time, threshold, rich) %>% 
  step_rm(quantity, cdrp, fcp, fcq) %>% 
  update_role(all_nominal(), new_role = "id") %>% 
  step_string2factor(all_nominal()) %>%
  step_nzv(all_numeric()) %>% 
  step_corr(all_numeric()) %>% 
  step_normalize(all_predictors()) %>% 
  step_pca(all_predictors())

pca_prep <- prep(pca_rec)

pca_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          6
##  predictor         40
## 
## Training data contained 4271 data points and no missing data.
## 
## Operations:
## 
## Variables removed file, time, threshold, rich [trained]
## Variables removed quantity, cdrp, fcp, fcq [trained]
## Factor variables from cdr3, cycle, expgroup [trained]
## Sparse, unbalanced variable filter removed n_C, invalid [trained]
## Correlation filter removed MW [trained]
## Centering and scaling for length, AV, IP, flex, gravy, ... [trained]
## PCA extraction with length, AV, IP, flex, gravy, SSF_Helix, ... [trained]
juice(pca_prep) %>% 
  ggplot(aes(PC1, PC2, label = cdr3)) +
  geom_point(aes(color = expgroup)) +
  geom_text(check_overlap = T, hjust = "inward", family = "IBMPlexSans") +
  labs(color = NULL)

library(plotly)
juice(pca_prep) %>% 
  plot_ly(
    x = .$PC1,
    y = .$PC2,
    z = .$PC3,
    type = "scatter3d",
    mode = "markers",
    color = .$expgroup
  )
# clean data
rm(pca_prep, pca_rec)

2.2 KPCA Poly

library(tidytext)

set.seed(42)
kpca_poly_prep <- 
  recipe( ~ ., data = cdr) %>% 
  step_rm(file, time, threshold, rich) %>% 
  step_rm(quantity, cdrp, fcp, fcq) %>% 
  update_role(all_nominal(), new_role = "id") %>% 
  step_string2factor(all_nominal()) %>%
  step_nzv(all_numeric()) %>% 
  step_corr(all_numeric()) %>% 
  step_normalize(all_predictors()) %>% 
  step_kpca_poly(all_predictors())

kpca_poly_prep <- prep(kpca_poly_prep)

kpca_poly_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          6
##  predictor         40
## 
## Training data contained 4271 data points and no missing data.
## 
## Operations:
## 
## Variables removed file, time, threshold, rich [trained]
## Variables removed quantity, cdrp, fcp, fcq [trained]
## Factor variables from cdr3, cycle, expgroup [trained]
## Sparse, unbalanced variable filter removed n_C, invalid [trained]
## Correlation filter removed MW [trained]
## Centering and scaling for length, AV, IP, flex, gravy, ... [trained]
## Polynomial kernel PCA (polydot) extraction with length, AV, IP, flex, ... [trained]
juice(kpca_poly_prep)
## # A tibble: 4,271 x 8
##    cdr3            cycle expgroup   kPC1  kPC2   kPC3   kPC4   kPC5
##    <fct>           <fct> <fct>     <dbl> <dbl>  <dbl>  <dbl>  <dbl>
##  1 GGVNWNDQ        R4    rafael    213.  181.    20.9  346.  -361. 
##  2 APAGREFDY       R4    rafael    187.  136.  -106.  -199.   -26.4
##  3 PLTGLHY         R4    rafael    -10.0 173.   -16.8 -320.   154. 
##  4 VQESWGLIDS      R4    rafael    176.  129.  -143.  -205.  -112. 
##  5 DPGHGDDY        R4    rafael    144.  220.   -41.5  635.  -337. 
##  6 DREVTEAHYHPMFDS R4    rafael   -956.  -55.2  -94.3 -216.  -378. 
##  7 DIRWEMGTGHWFDP  R4    rafael   -151.   45.8 -186.  -184.  -199. 
##  8 ETWGPEY         R4    rafael    196.   41.9  -74.9   90.5 -168. 
##  9 EDGYSYGYGY      R4    rafael    140.  -51.0   33.9 -246.    15.4
## 10 DLHWGSVDH       R4    rafael    163.   82.7  -54.1 -162.  -257. 
## # … with 4,261 more rows
juice(kpca_poly_prep) %>% 
  ggplot(aes(kPC1, kPC2, label = cdr3)) +
  geom_point(aes(color = expgroup)) +
  geom_text(check_overlap = T, hjust = "inward", family = "IBMPlexSans") +
  labs(color = NULL)

library(plotly)
juice(kpca_poly_prep) %>% 
  plot_ly(
    x = .$kPC1,
    y = .$kPC2,
    z = .$kPC3,
    type = "scatter3d",
    mode = "markers",
    color = .$expgroup
  )
# clean data
rm(kpca_poly_prep, kpca_rec)

2.3 KPCA RBF

library(tidytext)

set.seed(42)
kpca_rbf_prep <- 
  recipe( ~ ., data = cdr) %>% 
  step_rm(file, time, threshold, rich) %>% 
  step_rm(quantity, cdrp, fcp, fcq) %>% 
  update_role(all_nominal(), new_role = "id") %>% 
  step_string2factor(all_nominal()) %>%
  step_nzv(all_numeric()) %>% 
  step_corr(all_numeric()) %>% 
  step_normalize(all_predictors()) %>% 
  step_kpca_rbf(all_predictors())

kpca_rbf_prep <- prep(kpca_rbf_prep)

kpca_rbf_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          6
##  predictor         40
## 
## Training data contained 4271 data points and no missing data.
## 
## Operations:
## 
## Variables removed file, time, threshold, rich [trained]
## Variables removed quantity, cdrp, fcp, fcq [trained]
## Factor variables from cdr3, cycle, expgroup [trained]
## Sparse, unbalanced variable filter removed n_C, invalid [trained]
## Correlation filter removed MW [trained]
## Centering and scaling for length, AV, IP, flex, gravy, ... [trained]
## RBF kernel PCA (rbfdot) extraction with length, AV, IP, flex, ... [trained]
juice(kpca_rbf_prep)
## # A tibble: 4,271 x 8
##    cdr3            cycle expgroup   kPC1  kPC2   kPC3    kPC4   kPC5
##    <fct>           <fct> <fct>     <dbl> <dbl>  <dbl>   <dbl>  <dbl>
##  1 GGVNWNDQ        R4    rafael    0.962 -1.63 -0.412 -0.751  -0.137
##  2 APAGREFDY       R4    rafael   -0.115 -1.05  0.255 -1.24   -3.19 
##  3 PLTGLHY         R4    rafael    0.686 -1.55  1.12  -0.290   0.298
##  4 VQESWGLIDS      R4    rafael    0.161 -1.25 -0.252 -0.599  -0.118
##  5 DPGHGDDY        R4    rafael   -0.698 -1.17 -0.439 -0.783  -1.32 
##  6 DREVTEAHYHPMFDS R4    rafael    0.865 -1.40 -0.482 -0.509   0.492
##  7 DIRWEMGTGHWFDP  R4    rafael    0.865 -1.42 -0.457 -0.530   0.481
##  8 ETWGPEY         R4    rafael    0.818 -1.73 -0.551 -2.24   -6.78 
##  9 EDGYSYGYGY      R4    rafael    0.905 14.7   0.244  3.10    0.928
## 10 DLHWGSVDH       R4    rafael    0.696 -2.04  3.16  -0.0690  0.577
## # … with 4,261 more rows
juice(kpca_rbf_prep) %>% 
  ggplot(aes(kPC1, kPC2, label = cdr3)) +
  geom_point(aes(color = expgroup)) +
  geom_text(check_overlap = T, hjust = "inward", family = "IBMPlexSans") +
  labs(color = NULL)

library(plotly)
juice(kpca_rbf_prep) %>% 
  plot_ly(
    x = .$kPC1,
    y = .$kPC2,
    z = .$kPC3,
    type = "scatter3d",
    mode = "markers",
    color = .$expgroup
  )
# clean data
rm(kpca_rbf_prep, kpca_rec)

2.4 ICA

library(tidytext)

set.seed(42)
ica_rec <- 
  recipe( ~ ., data = cdr) %>% 
  step_rm(file, time, threshold, rich) %>% 
  step_rm(quantity, cdrp, fcp, fcq) %>% 
  update_role(all_nominal(), new_role = "id") %>% 
  step_string2factor(all_nominal()) %>%
  step_nzv(all_numeric()) %>% 
  step_corr(all_numeric()) %>% 
  step_normalize(all_predictors()) %>% 
  step_ica(all_predictors())

ica_prep <- prep(ica_rec)

ica_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          6
##  predictor         40
## 
## Training data contained 4271 data points and no missing data.
## 
## Operations:
## 
## Variables removed file, time, threshold, rich [trained]
## Variables removed quantity, cdrp, fcp, fcq [trained]
## Factor variables from cdr3, cycle, expgroup [trained]
## Sparse, unbalanced variable filter removed n_C, invalid [trained]
## Correlation filter removed MW [trained]
## Centering and scaling for length, AV, IP, flex, gravy, ... [trained]
## ICA extraction with length, AV, IP, flex, gravy, SSF_Helix, ... [trained]
juice(ica_prep)
## # A tibble: 4,271 x 8
##    cdr3            cycle expgroup    IC1    IC2    IC3     IC4    IC5
##    <fct>           <fct> <fct>     <dbl>  <dbl>  <dbl>   <dbl>  <dbl>
##  1 GGVNWNDQ        R4    rafael   -0.259 -1.39  -0.135 -0.945   0.797
##  2 APAGREFDY       R4    rafael   -0.308 -0.536  0.407 -0.0471 -1.10 
##  3 PLTGLHY         R4    rafael    0.865 -0.635 -1.22  -0.125  -1.01 
##  4 VQESWGLIDS      R4    rafael   -0.666 -0.251 -0.108 -0.652  -0.429
##  5 DPGHGDDY        R4    rafael   -0.624 -1.72   0.994 -0.452   0.156
##  6 DREVTEAHYHPMFDS R4    rafael    0.384  0.176  2.41   0.338  -1.33 
##  7 DIRWEMGTGHWFDP  R4    rafael    0.315  0.414  1.37   0.286  -0.563
##  8 ETWGPEY         R4    rafael   -1.08  -1.26   0.222  0.136  -0.563
##  9 EDGYSYGYGY      R4    rafael   -0.930 -0.338 -0.691  0.519   0.932
## 10 DLHWGSVDH       R4    rafael    0.550 -0.951  0.298  0.355  -0.739
## # … with 4,261 more rows
juice(ica_prep) %>% 
  ggplot(aes(IC1, IC2, label = cdr3)) +
  geom_point(aes(color = expgroup)) +
  geom_text(check_overlap = T, hjust = "inward", family = "IBMPlexSans") +
  labs(color = NULL)

library(plotly)
juice(ica_prep) %>% 
  plot_ly(
    x = .$IC1,
    y = .$IC2,
    z = .$IC3,
    type = "scatter3d",
    mode = "markers",
    color = .$expgroup
  )
# clean data
rm(ica_prep, ica_rec)

2.5 UMAP

library(embed)

set.seed(42)
umap_rec <- 
  recipe( ~ ., data = cdr) %>% 
  step_rm(file, time, threshold, rich) %>% 
  step_rm(quantity, cdrp, fcp, fcq) %>% 
  update_role(all_nominal(), new_role = "id") %>% 
  step_string2factor(all_nominal()) %>%
  step_nzv(all_numeric()) %>% 
  step_corr(all_numeric()) %>% 
  step_normalize(all_predictors()) %>% 
  step_umap(all_predictors(), num_comp = 3,
            options = list(
              verbose = T,
              n_threads = parallel::detectCores()
            ))

umap_prep <- prep(umap_rec)

umap_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          6
##  predictor         40
## 
## Training data contained 4271 data points and no missing data.
## 
## Operations:
## 
## Variables removed file, time, threshold, rich [trained]
## Variables removed quantity, cdrp, fcp, fcq [trained]
## Factor variables from cdr3, cycle, expgroup [trained]
## Sparse, unbalanced variable filter removed n_C, invalid [trained]
## Correlation filter removed MW [trained]
## Centering and scaling for length, AV, IP, flex, gravy, ... [trained]
## UMAP embedding for length, AV, IP, flex, gravy, ... [trained]
juice(umap_prep) %>% 
  ggplot(aes(umap_1, umap_2, label = cdr3)) +
  geom_point(aes(color = expgroup)) +
  geom_text(check_overlap = T, hjust = "inward", family = "IBMPlexSans") +
  labs(color = NULL)

library(plotly)
juice(umap_prep) %>% 
  plot_ly(
    x = .$umap_1,
    y = .$umap_2,
    z = .$umap_3,
    type = "scatter3d",
    mode = "markers",
    color = .$expgroup
  )
# clean data
rm(umap_prep, umap_rec)

2.6 T-sne

2.6.1 Comparison Between Groups

library(Rtsne)

tsne_df <-
  cdr %>% 
  ungroup() %>% 
  select(cdr3, expgroup, cycle, !where(is.character)) %>%
  select(!c(which(apply(., 2, var)==0))) %>% 
  select(!c("quantity", "cdrp", "fcp", "fcq", "rich", "threshold")) %>% 
  mutate(across(where(is.character), as.factor))

set.seed(42)
tsne_out <- 
  tsne_df %>% 
  unique() %>% 
    Rtsne(
      X = .,
      dims = 3,
      perplexity = 30,
      theta = 0.5,
      max_iter = 1E3,
      verbose = T,
      pca_center = T,
      pca_scale = T,
      normalize = T,
      partial_pca = T,
      eta = 200.0,
      exaggeration_factor = 12.0,
      num_threads = parallel::detectCores()
      )
## Performing PCA
## Read the 4271 x 50 data matrix successfully!
## OpenMP is working. 8 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 2.39 seconds (sparsity = 0.032983)!
## Learning embedding...
## Iteration 50: error is 86.293961 (50 iterations in 27.57 seconds)
## Iteration 100: error is 85.775593 (50 iterations in 9.25 seconds)
## Iteration 150: error is 85.535944 (50 iterations in 4.76 seconds)
## Iteration 200: error is 85.491334 (50 iterations in 4.92 seconds)
## Iteration 250: error is 85.488876 (50 iterations in 5.84 seconds)
## Iteration 300: error is 2.804065 (50 iterations in 3.87 seconds)
## Iteration 350: error is 2.410835 (50 iterations in 3.89 seconds)
## Iteration 400: error is 2.238190 (50 iterations in 4.06 seconds)
## Iteration 450: error is 2.138289 (50 iterations in 4.48 seconds)
## Iteration 500: error is 2.074880 (50 iterations in 5.23 seconds)
## Iteration 550: error is 2.032770 (50 iterations in 5.29 seconds)
## Iteration 600: error is 2.002685 (50 iterations in 5.71 seconds)
## Iteration 650: error is 1.980768 (50 iterations in 5.65 seconds)
## Iteration 700: error is 1.964674 (50 iterations in 5.53 seconds)
## Iteration 750: error is 1.951831 (50 iterations in 5.38 seconds)
## Iteration 800: error is 1.942696 (50 iterations in 5.50 seconds)
## Iteration 850: error is 1.936658 (50 iterations in 5.35 seconds)
## Iteration 900: error is 1.931697 (50 iterations in 5.39 seconds)
## Iteration 950: error is 1.926850 (50 iterations in 6.20 seconds)
## Iteration 1000: error is 1.922362 (50 iterations in 5.35 seconds)
## Fitting performed in 129.22 seconds.
cdr %>% 
  ungroup() %>% 
  select(cdr3, expgroup, cycle, rich, !where(is.character)) %>%
  select(!c(which(apply(., 2, var)==0))) %>% 
  select(!c("quantity", "cdrp", "fcp", "fcq", "threshold")) -> a

tsne_out %>% 
  .$Y %>% 
  as_tibble() %>% 
  ggplot() +
    geom_point(aes(V1, V2, color = a$expgroup))

tsne_out %>% 
  .$Y %>% 
  as_tibble() %>% 
  plot_ly(
    x = .$V1,
    y = .$V2,
    z = .$V3,
    type = "scatter3d",
    mode = "markers",
    color = a$expgroup
  ) %>% 
  layout(title = "Comparison Between Experiments")

2.6.2 Without Groups

library(Rtsne)

tsne_df <-
  cdr %>% 
  ungroup() %>% 
  select(!where(is.character)) %>% 
  # select(cdr3, expgroup, cycle, !where(is.character)) %>% 
  select(!c(which(apply(., 2, var)==0))) %>% 
  select(!c("quantity", "cdrp", "fcp", "fcq", "rich", "threshold")) %>% 
  mutate(across(where(is.character), as.factor))

set.seed(42)
tsne_out <- 
  tsne_df %>% 
  unique() %>% 
    Rtsne(
      X = .,
      dims = 3,
      perplexity = 30,
      theta = 0.5,
      max_iter = 1E3,
      verbose = T,
      pca_center = T,
      pca_scale = T,
      normalize = T,
      # partial_pca = T,
      eta = 200.0,
      exaggeration_factor = 12.0,
      num_threads = parallel::detectCores()
      )
## Performing PCA
## Read the 3849 x 34 data matrix successfully!
## OpenMP is working. 8 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 1.67 seconds (sparsity = 0.034859)!
## Learning embedding...
## Iteration 50: error is 85.209017 (50 iterations in 11.08 seconds)
## Iteration 100: error is 85.074387 (50 iterations in 9.01 seconds)
## Iteration 150: error is 85.065576 (50 iterations in 6.01 seconds)
## Iteration 200: error is 85.062826 (50 iterations in 5.59 seconds)
## Iteration 250: error is 85.061685 (50 iterations in 5.05 seconds)
## Iteration 300: error is 3.107545 (50 iterations in 5.87 seconds)
## Iteration 350: error is 2.661810 (50 iterations in 4.22 seconds)
## Iteration 400: error is 2.481345 (50 iterations in 4.26 seconds)
## Iteration 450: error is 2.381366 (50 iterations in 5.19 seconds)
## Iteration 500: error is 2.318935 (50 iterations in 4.33 seconds)
## Iteration 550: error is 2.278227 (50 iterations in 4.46 seconds)
## Iteration 600: error is 2.250068 (50 iterations in 4.42 seconds)
## Iteration 650: error is 2.229087 (50 iterations in 4.49 seconds)
## Iteration 700: error is 2.214116 (50 iterations in 4.65 seconds)
## Iteration 750: error is 2.202790 (50 iterations in 4.33 seconds)
## Iteration 800: error is 2.194687 (50 iterations in 4.51 seconds)
## Iteration 850: error is 2.188096 (50 iterations in 5.57 seconds)
## Iteration 900: error is 2.182072 (50 iterations in 4.58 seconds)
## Iteration 950: error is 2.177503 (50 iterations in 4.35 seconds)
## Iteration 1000: error is 2.172849 (50 iterations in 4.51 seconds)
## Fitting performed in 106.49 seconds.
cdr %>% 
  ungroup() %>% 
  select(!where(is.character)) %>% 
  # select(cdr3, expgroup, cycle, rich, !where(is.character)) %>% 
  select(!c(which(apply(., 2, var)==0))) %>% 
  select(!c("quantity", "cdrp", "fcp", "fcq", "threshold")) -> a

tsne_out %>% 
  .$Y %>% 
  as_tibble() %>% 
  ggplot() +
    geom_point(aes(V1, V2, color = a$expgroup))

tsne_out %>% 
  .$Y %>% 
  as_tibble() %>% 
  plot_ly(
    x = .$V1,
    y = .$V2,
    z = .$V3,
    type = "scatter3d",
    mode = "markers",
    color = I("blue"),
    alpha = .1
  ) %>% 
  layout(title = "All experiments together")